#!/usr/bin/env bash
#
# Launch a fine-tuning run of the `run_t5_multipack` module through torchrun,
# starting from the mesolitica/nanot5-base-malaysian-translation-v2 checkpoint.
#
# Usage:
#   bash <this-script>
#   `run_t5_multipack` is loaded with `-m`, so it must be importable from the
#   directory the script is started in. --train_file and --output_dir are given
#   as relative names, so they presumably resolve against that same directory.
#
# Environment:
#   WANDB_PROJECT is set for the torchrun command only (prefix assignment); it
#   is not left behind in the calling shell. Presumably it selects the
#   Weights & Biases project the run logs to.
#
# Notes for reviewers:
#   * One worker process is started per node (--nproc_per_node 1).
#   * Assuming run_t5_multipack follows the usual Hugging Face Trainer argument
#     semantics (TODO confirm against that module), batch size 1 with 16
#     gradient-accumulation steps gives 16 examples per optimizer step.
#   * --do_eval is not passed, so --eval_steps and --per_device_eval_batch_size
#     are presumably inert; they are kept unchanged.
#   * NOTE(review): the W&B project name ends in "-multipack" while --output_dir
#     ends in "-packing"; confirm the different suffixes are intentional.
#   * All options use the `--flag value` form; comments cannot be placed between
#     the continuation lines below without breaking the command.
WANDB_PROJECT="nanot5-base-malaysian-cased-translation-v4-multipack" \
  /root/app/venv/bin/torchrun \
  --nproc_per_node 1 \
  -m run_t5_multipack \
  --model_name_or_path mesolitica/nanot5-base-malaysian-translation-v2 \
  --num_train_epochs 2 \
  --eval_steps 1000000000 \
  --logging_steps 2 \
  --save_steps 200 \
  --save_total_limit 3 \
  --do_train \
  --train_file malaysian-translation-v2-multipack-2048 \
  --output_dir nanot5-base-malaysian-cased-translation-v4-packing \
  --dataloader_num_workers 5 \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 3 \
  --gradient_accumulation_steps 16 \
  --max_source_length 2048 \
  --max_target_length 2048 \
  --learning_rate 2e-5 \
  --gradient_checkpointing false \
  --weight_decay 0.01 \
  --bf16 \
  --ddp_find_unused_parameters true \
  --dataloader_pin_memory false